Lab 11

Author

CJ

1.

library(readr)
library(plotly)
Loading required package: ggplot2

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(zoo)

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Rows: 61942 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): state, fips
dbl  (2): cases, deaths
date (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 52 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): state, state_name, geo_id
dbl (2): population, pop_density

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
cv_states <- merge(cv_states, state_pops, by="state")

2.

dim(cv_states)
[1] 58094     9
head(cv_states)
    state       date fips   cases deaths geo_id population pop_density abb
1 Alabama 2023-01-04   01 1587224  21263     01    4887871    96.50939  AL
2 Alabama 2020-04-25   01    6213    213     01    4887871    96.50939  AL
3 Alabama 2023-02-26   01 1638348  21400     01    4887871    96.50939  AL
4 Alabama 2022-12-03   01 1549285  21129     01    4887871    96.50939  AL
5 Alabama 2020-05-06   01    8691    343     01    4887871    96.50939  AL
6 Alabama 2021-04-21   01  524367  10807     01    4887871    96.50939  AL
tail(cv_states)
        state       date fips  cases deaths geo_id population pop_density abb
58089 Wyoming 2022-09-11   56 175290   1884     56     577737    5.950611  WY
58090 Wyoming 2022-08-21   56 173487   1871     56     577737    5.950611  WY
58091 Wyoming 2021-01-26   56  51152    596     56     577737    5.950611  WY
58092 Wyoming 2021-02-21   56  53795    662     56     577737    5.950611  WY
58093 Wyoming 2021-08-22   56  70671    809     56     577737    5.950611  WY
58094 Wyoming 2021-03-20   56  55581    693     56     577737    5.950611  WY
str(cv_states)
'data.frame':   58094 obs. of  9 variables:
 $ state      : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
 $ date       : Date, format: "2023-01-04" "2020-04-25" ...
 $ fips       : chr  "01" "01" "01" "01" ...
 $ cases      : num  1587224 6213 1638348 1549285 8691 ...
 $ deaths     : num  21263 213 21400 21129 343 ...
 $ geo_id     : chr  "01" "01" "01" "01" ...
 $ population : num  4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 $ abb        : chr  "AL" "AL" "AL" "AL" ...

3.

# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")


# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
str(cv_states)
'data.frame':   58094 obs. of  9 variables:
 $ state      : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ date       : Date, format: "2023-01-04" "2020-04-25" ...
 $ fips       : chr  "01" "01" "01" "01" ...
 $ cases      : num  1587224 6213 1638348 1549285 8691 ...
 $ deaths     : num  21263 213 21400 21129 343 ...
 $ geo_id     : chr  "01" "01" "01" "01" ...
 $ population : num  4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 $ abb        : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
### FINISH THE CODE HERE 
# order the data first by state, second by date
cv_states = cv_states[order(cv_states$state, cv_states$date),]

# Confirm the variables are now correctly formatted
str(cv_states)
'data.frame':   58094 obs. of  9 variables:
 $ state      : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ date       : Date, format: "2020-03-13" "2020-03-14" ...
 $ fips       : chr  "01" "01" "01" "01" ...
 $ cases      : num  6 12 23 29 39 51 78 106 131 157 ...
 $ deaths     : num  0 0 0 0 0 0 0 0 0 0 ...
 $ geo_id     : chr  "01" "01" "01" "01" ...
 $ population : num  4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 $ abb        : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
       state       date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13   01     6      0     01    4887871    96.50939  AL
597  Alabama 2020-03-14   01    12      0     01    4887871    96.50939  AL
282  Alabama 2020-03-15   01    23      0     01    4887871    96.50939  AL
12   Alabama 2020-03-16   01    29      0     01    4887871    96.50939  AL
266  Alabama 2020-03-17   01    39      0     01    4887871    96.50939  AL
78   Alabama 2020-03-18   01    51      0     01    4887871    96.50939  AL
tail(cv_states)
        state       date fips  cases deaths geo_id population pop_density abb
57902 Wyoming 2023-03-18   56 185640   2009     56     577737    5.950611  WY
57916 Wyoming 2023-03-19   56 185640   2009     56     577737    5.950611  WY
57647 Wyoming 2023-03-20   56 185640   2009     56     577737    5.950611  WY
57867 Wyoming 2023-03-21   56 185800   2014     56     577737    5.950611  WY
58057 Wyoming 2023-03-22   56 185800   2014     56     577737    5.950611  WY
57812 Wyoming 2023-03-23   56 185800   2014     56     577737    5.950611  WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
       state       date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13   01     6      0     01    4887871    96.50939  AL
597  Alabama 2020-03-14   01    12      0     01    4887871    96.50939  AL
282  Alabama 2020-03-15   01    23      0     01    4887871    96.50939  AL
12   Alabama 2020-03-16   01    29      0     01    4887871    96.50939  AL
266  Alabama 2020-03-17   01    39      0     01    4887871    96.50939  AL
78   Alabama 2020-03-18   01    51      0     01    4887871    96.50939  AL
summary(cv_states)
           state            date                fips          
 Washington   : 1158   Min.   :2020-01-21   Length:58094      
 Illinois     : 1155   1st Qu.:2020-12-06   Class :character  
 California   : 1154   Median :2021-09-11   Mode  :character  
 Arizona      : 1153   Mean   :2021-09-10                     
 Massachusetts: 1147   3rd Qu.:2022-06-17                     
 Wisconsin    : 1143   Max.   :2023-03-23                     
 (Other)      :51184                                          
     cases              deaths          geo_id            population      
 Min.   :       1   Min.   :     0   Length:58094       Min.   :  577737  
 1st Qu.:  112125   1st Qu.:  1598   Class :character   1st Qu.: 1805832  
 Median :  418120   Median :  5901   Mode  :character   Median : 4468402  
 Mean   :  947941   Mean   : 12553                      Mean   : 6397965  
 3rd Qu.: 1134318   3rd Qu.: 15952                      3rd Qu.: 7535591  
 Max.   :12169158   Max.   :104277                      Max.   :39557045  
                                                                          
  pop_density             abb       
 Min.   :    1.292   WA     : 1158  
 1st Qu.:   43.659   IL     : 1155  
 Median :  107.860   CA     : 1154  
 Mean   :  423.031   AZ     : 1153  
 3rd Qu.:  229.511   MA     : 1147  
 Max.   :11490.120   WI     : 1143  
 NA's   :1106        (Other):51184  
min(cv_states$date)
[1] "2020-01-21"
max(cv_states$date)
[1] "2023-03-23"

4.

# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
  cv_subset <- subset(cv_states, state == state_list[i])
  cv_subset <- cv_subset[order(cv_subset$date),]
  
  # add starting level for new cases and deaths
  cv_subset$new_cases <- cv_subset$cases[1]
  cv_subset$new_deaths <- cv_subset$deaths[1]

  ### FINISH THE CODE HERE
  for (j in 2:nrow(cv_subset)) {
    cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j - 1]
    cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j - 1]
  }

  # include in main dataset
  cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
  cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}

# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2021-06-01")

### FINISH THE CODE HERE
# Inspect outliers in new_cases using plotly
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) +
  ylab("New Cases") +geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace

p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
  ylab("New Deaths") + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace

# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0

# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])
  
  # add starting level for new cases and deaths
  cv_subset$cases = cv_subset$cases[1]
  cv_subset$deaths = cv_subset$deaths[1]
  
  ### FINISH CODE HERE
  for (j in 2:nrow(cv_subset)) {
    cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j - 1]
    cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j - 1]
  }
  # include in main dataset
  cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
  cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}

# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA,
                                    align='right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA,
                                     align='right') %>% round(digits = 0)

# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
  geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)

5.

### FINISH CODE HERE
# add population normalized (by 100,000) counts for each variable
cv_states$per100k =  as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k =  as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
Warning: NAs introduced by coercion
cv_states$deathsper100k =  as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k =  as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
Warning: NAs introduced by coercion
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))

# create a `cv_states_today` variable
cv_states_today = subset(cv_states, date==max(cv_states$date))

6.

# pop_density vs. cases
cv_states_today %>% 
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 1 observations
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state!="District of Columbia")

# pop_density vs. cases after filtering
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 1 observations

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# pop_density vs. deathsper100k
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 1 observations

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# Adding hoverinfo
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
          hoverinfo = 'text',
          text = ~paste( paste(state, ":", sep=""), paste(" Cases per 100k: ", per100k, sep="") , 
                         paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
  layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
         yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
         hovermode = "compare")
Warning: Ignoring 1 observations

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

7.

p <- ggplot(cv_states_today_filter, aes(x=pop_density, y=deathsper100k,
                                         size=population)) + geom_point() +
  geom_smooth()
ggplotly(p)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
Warning: The following aesthetics were dropped during statistical transformation: size
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?
cor(cv_states_today_filter$pop_density, cv_states_today_filter$deathsper100k,
    use="pairwise.complete.obs")
[1] 0.1560134

8.

plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter",
        mode = "lines")
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
cv_states %>% filter(state=="Florida") %>% plot_ly(x = ~date, y = ~new_cases,
                                                   type = "scatter", mode = "lines") %>%
  add_trace(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines") 

9.

library(tidyr)
cv_states_mat <- cv_states %>% select(state, date, new_cases) %>%
  dplyr::filter(date>as.Date("2021-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state,
                                            values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
        z=~cv_states_mat2,
        type="heatmap",
        showscale=T)
# Repeat with newper100k
cv_states_mat <- cv_states %>% select(state, date, newper100k) %>%
  dplyr::filter(date>as.Date("2021-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state,
                                            values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
        z=~cv_states_mat2,
        type="heatmap",
        showscale=T)
# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2021-06-15"), as.Date("2021-11-01"), by=14)

cv_states_mat <- cv_states %>% select(state, date, newper100k) %>%
  filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state,
                                            values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
        z=~cv_states_mat2,
        type="heatmap",
        showscale=T)

10.

### For specified date

pick.date = "2021-10-15"

# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>% filter(date==pick.date) %>%
  select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ",
                                         newper100k, '<br>', "Cases: ", cases,
                                         '<br>', "Deaths: ", deaths))

# Set up mapping details
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# Make sure both maps are on the same color scale
shadeLimit <- 125

# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~newper100k, text = ~hover, locations = ~state,
    color = ~newper100k, colors = 'Purples'
  )
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", pick.date),
                        limits = c(0,shadeLimit))
fig <- fig %>% layout(
  title = paste('Cases per 100k by State as of ', pick.date, '<br>(Hover for value)'),
  geo = set_map_details
)
fig_pick.date <- fig

#############
### Map for today's date

# Extract the data for each state by its abbreviation
cv_per100 <- cv_states_today %>%  select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ",
                                         newper100k, '<br>', "Cases: ", cases,
                                         '<br>', "Deaths: ", deaths))

# Set up mapping details
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~newper100k, text = ~hover, locations = ~state,
    color = ~newper100k, colors = 'Purples'
  )
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", Sys.Date()), limits = c(0,shadeLimit))
fig <- fig %>% layout(
  title = paste('Cases per 100k by State as of', Sys.Date(), '<br>(Hover for value)'),
  geo = set_map_details
)
fig_Today <- fig


### Plot together 
subplot(fig_pick.date, fig_Today, nrows = 2, margin = .05)